PM566-Lab 4

Author

Gowri

1. Read in the data

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggplot2)   # also needed for ggplot functions


if (!file.exists("met_all.gz"))
  download.file(
    url = "https://raw.githubusercontent.com/USCbiostats/data-science-data/master/02_met/met_all.gz",
    destfile = "met_all.gz",
    method   = "libcurl",
    timeout  = 60
    )
met <- data.table::fread("met_all.gz")

2. Prepare the data

# remove temperatures less than -17c
# make sure there is no missing data in key variables coded as 9999, 99, etc 
met <- met[temp > -17][elev == 9999.0, elev := NA]

#GENERATE A DATE VARIABLE USING FUNCTIONS as.Date() (hint: you will need the following to create a date paste(year,month,day,sep="-"))
met[,week :=(as.Date(paste(year,month,day,sep = "-")))]

#using the date.table::week functin, keep the observations of the first week of the month \
met <- met[week == min(week, na.rm=TRUE)]

#compute the mean by station of the variables temp, rh,wind.sp, vis.dist, dew.point, lat, and elev
met_avg <- met[,.(temp=mean(temp,na.rm=TRUE), rh=mean(rh,na.rm=TRUE),
               wind.sp=mean(wind.sp,na.rm=TRUE),
               dew.point = mean(dew.point, na.rm=TRUE),
               lat=mean(lat), lon=mean(lon),
               elev= mean(elev,na.rm=TRUE)), by="USAFID"]
#create a region variable for NW,SW,NE based on lon=-98.00 and lat= 39.71 degree
met_avg$region <- ifelse(met_avg$lon > -98 & met_avg$lat >39.71, "northeast",
                         ifelse(met_avg$lon > -98 & 
                                  met_avg$lat < 39.71,"south east",
                                ifelse(met_avg$lon < -98 & met_avg$lat >39.71,
                                       "north west", "south west")))
#Create a actegorical variable for evealuation as in the lecture slides 
met_avg$elev_cat <- ifelse(met_avg$elev>252,"high", "low")

table(met_avg$region)

north west  northeast south east south west 
       146        484        648        296 

3. Use geom_violin to examine the wind speed and dew point by region

#Description The violin plots reveal distinct regional patterns in wind seppd distribuitions. The north east shows symmetruc distribution with most clustred at 3m/s, while the northeast region displays variable distribution. The southeast exhibits less variablity , and the south west region demonstrates windspreds woith the greated variability compared to the rest. Overall, the north east region appears to have the highest wind speeds, while the southwest region shows the most concentrated distribution aorunf 3 m/s

met_avg %>%
  filter(!(region %in% NA)) %>% #make sure to deal with NAs
  ggplot()+
 
  facet_wrap(~region, nrow=2) #use facets

met_avg %>%
  filter(!(dew.point %in% NA)) %>% #make sure to deal with NAs
  ggplot()+
  geom_violin(mapping = aes (y=dew.point, x=1)) +
  facet_wrap(~region, nrow=2) #use facets

met_avg %>%
  filter(!(region %in% NA)) %>%
  ggplot() +
  geom_boxplot(mapping = aes(y=rh,fill=region)) +
  facet_wrap(~region,nrow=2)

4. Use geom_jitter with stat_smooth to examine the association between dew point and wind speed by region

Description The scatter plot demonstrates a relationship between dew point and relative humidity across all regions. All regional regression lines show positive slopes, indicating that as dew point increases, relative humidity also increases. The relationship appears strongest in the North West and North East regions, while the South East and South West regions show a more moderate association. This pattern makes meteorological sense because higher dew points indicate more moisture in the air, which directly raises relative humidity levels.

met_avg %>%
  filter(!(dew.point %in% NA)) %>% #make sure to deal with NAs
  ggplot(mapping = aes(x=dew.point,y=rh, colour = region)) + #color paoints by region 
 geom_jitter() +
  stat_smooth(method = lm) # fit a linerar regression line by region 
`geom_smooth()` using formula = 'y ~ x'

met_avg %>%
  filter(!(dew.point %in% NA)) %>% #make sure to deal with NAs
  ggplot(mapping = aes(x=dew.point,y= wind.sp, colour = region)) + #color paoints by region 
 geom_jitter() +
  stat_smooth(method = lm) # fit a linerar regression line by region 
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 16 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 16 rows containing missing values or values outside the scale range
(`geom_point()`).

met_avg %>%
  filter(!is.na(dew.point) & !is.na(wind.sp)) %>% #make sure to deal with NAs
  ggplot(mapping = aes(x=dew.point,y=rh, colour = region)) + #color paoints by region 
 geom_jitter() +
  stat_smooth(method = lm) # fit a linerar regression line by region 
`geom_smooth()` using formula = 'y ~ x'

5. Use geom_bar to create barplots of the weather stations by elevation category colored by regionmet

Description The bar chart reveals significant differences in weather station distribution across regions and elevation categories. The South East region has the highest number of stations overall, concentrated at low elevations. The North East region shows more stations at high elevations. The North West region has the fewest total stations but shows a higher proportion located in high elevations. The South West region has a moderate number of stations, with more located at high elevations. This distribution pattern suggests that regional geography strongly influences where weather stations are placed.

met_avg %>%
  filter(!(elev_cat %in% NA)) %>% # make sure to deal with NAvalues 
  ggplot()+
  geom_bar(mapping = aes(x=elev_cat,fill=region), position = "dodge") + # bars by elevation category using position ="dodge"
           scale_fill_brewer(palette = "PuOr") + # change colors from teh default. color by region usign scale_fill_brewer see this 
             labs(title = "Number of weather stationsn by elevation category and region",
                  x="Elevation Category", y= "Count") + #count nice labels on the axes and add a title 
             theme_bw()

6. Use stat_summary to examine mean dew point and wind speed by region with standard deviation error bars

Description The region with the highest mean dew point is the South East. The North East displays the lowest mean dew point. The error bars reveal that the South West region has the most variable conditions, indicating a wider range of dew point values compared to other regions. Regional differences in wind speed are compared to dew point patterns. The North West and South West regions show higher mean wind speeds around 3 m/s, indicating greater variability in wind conditions. The North East and South East regions display lower mean wind speeds,with less varibaility.

met_avg %>%
  filter(!is.na(dew.point) & !is.na(wind.sp)) %>%
  ggplot(mapping = aes(x=region, y=dew.point)) +
  stat_summary(fun.data = "mean_sdl", geom = "errorbar")# add anothe rlayer of stats_summary but change the heom to errorbar

stat_summary(fun.data = "mean_sdl") # use fun.data="mean_sdl" in summary
geom_pointrange: na.rm = FALSE, orientation = NA
stat_summary: fun.data = mean_sdl, fun = NULL, fun.max = NULL, fun.min = NULL, fun.args = list(), na.rm = FALSE, orientation = NA
position_identity 
met_avg %>%
  filter(!(region %in% NA)) %>%
  ggplot(mapping = aes(x=region, y=wind.sp)) +
  stat_summary(fun.data = "mean_sdl", geom = "errorbar") +
  stat_summary(fun.data = "mean_sdl")
Warning: Removed 16 rows containing non-finite outside the scale range
(`stat_summary()`).
Removed 16 rows containing non-finite outside the scale range
(`stat_summary()`).

7. Make a map showing the spatial trend in relative humidity in the US

Description The relative humidity map reveals a clear east–west gradient across the United States. The Eastern regions show higher relative humidity values (70–90%), represented by purple and red colors. The central regions display moderate humidity levels (40–60%), while the Western regions exhibit lower values (20–40%), shown in blue. The top 10 highest relative humidity locations are predominantly located in the eastern and southeastern United States, . This pattern demonstrates how geographys affect regional humidity distributions.

library(leaflet)
met_avg2<-met_avg[!is.na(rh)]

top10 <- met_avg2[rank(-rh) <= 10]

rh_pal = colorNumeric(c('blue','purple','red'), domain=met_avg2$rh)

leaflet(met_avg2) %>% 
  addProviderTiles('OpenStreetMap') %>%
  addCircles(lat=~lat, lng=~lon, color=~rh_pal(rh),
             label=~paste0(round(rh,2),'rh'), opacity=1,fillOpacity=1, radius=500) %>%
  addMarkers(lat=~lat, lng=~lon, label=~paste0(round(rh,2), 'rh'), data=top10) %>%
  addLegend('bottomleft', pal=rh_pal, values=met_avg2$rh,
            title="Relative Humidity", opacity=1)

8. Use a ggplot extension - ridgeline plots of temperature distributions by region and elevation

Description The ridgeline plots show clear temperature differences by region and elevation. At high elevations, the North West region has the coolest temperatures around 18 °C, while the South West region shows the warmest temperature. At low elevations, all regions are warmer than their high-elevation counterparts, with the South East and South West regions reaching 25–26 °C. The North East region maintains the most consistent temperatures across both elevation categories, while the South West region shows different patterns between high and low elevations. This demonstrates that elevation strongly influences regional temperature distributions, with low-elevation areas generally experiencing higher and more variable temperatures.

library(ggridges)

library(gganimate)
No renderer backend detected. gganimate will default to writing frames to separate files
Consider installing:
- the `gifski` package for gif output
- the `av` package for video output
and restarting the R session
met_avg %>%
  filter(!(temp %in% NA)) %>% 
  ggplot(mapping = aes(x=temp, y=region, fill = region)) +
  geom_density_ridges(alpha = 0.7) +
  facet_wrap(~elev_cat, nrow = 2) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Temperature Distributions by Region and Elevation Category", 
       x = "Temperature (°C)",
       y = "Region",
       fill="Region") +
  theme_bw() +
  theme(legend.position = "bottom")
Picking joint bandwidth of 0.944
Picking joint bandwidth of 1.46